package org.spider.core.utils;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import us.codecraft.xsoup.XElements;
import us.codecraft.xsoup.Xsoup;

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Utility methods for extracting data: XPath selection over jsoup nodes (through Xsoup)
 * and first-match lookups with cached regular expressions.
 */
public class ExtractUtils {

	/**
	 * Cache of compiled patterns, keyed by the regex source string.
	 * A concurrent map is used because this static cache is shared by every caller of
	 * the static helpers below. Entries are never evicted, so the cache grows with the
	 * number of distinct regex strings passed in.
	 */
	private static final Map<String, Pattern> patterns = new ConcurrentHashMap<>();

	/**
	 * Returns the cached {@link Pattern} for the given regex, compiling it with
	 * {@link Pattern#DOTALL} on first use.
	 *
	 * @param regx regex source; a {@code null} value results in a {@link NullPointerException}
	 * @return the compiled pattern
	 * @throws java.util.regex.PatternSyntaxException if the regex is invalid
	 */
	private static Pattern compile(String regx){
		Pattern pattern = patterns.get(regx);
		if(pattern == null){
			Pattern compiled = Pattern.compile(regx,Pattern.DOTALL);
			// Another thread may have stored the same regex meanwhile; keep whichever got in first.
			Pattern existing = patterns.putIfAbsent(regx, compiled);
			pattern = existing != null ? existing : compiled;
		}
		return pattern;
	}

	/**
	 * Evaluates the XPath against the element and returns all results as strings.
	 *
	 * @param element node the XPath is evaluated against
	 * @param xpath   XPath expression understood by Xsoup
	 * @return the result of {@code XElements.list()}
	 */
	public static List<String> getValuesByXPath(Element element,String xpath){
		return Xsoup.select(element,xpath).list();
	}

	/**
	 * Evaluates the XPath against the HTML string produced by {@code elements.html()}
	 * and returns all results as strings.
	 * NOTE(review): {@code html()} yields inner HTML, so the tags of the given elements
	 * themselves are presumably not part of the evaluated markup - confirm this is intended.
	 *
	 * @param elements elements whose HTML is handed to Xsoup
	 * @param xpath    XPath expression understood by Xsoup
	 * @return the result of {@code XElements.list()}
	 */
	public static List<String> getValuesByXPath(Elements elements,String xpath){
		return Xsoup.select(elements.html(),xpath).list();
	}

	/**
	 * Evaluates the XPath against the element and returns a single result.
	 *
	 * @param element node the XPath is evaluated against
	 * @param xpath   XPath expression understood by Xsoup
	 * @return the result of {@code XElements.get()}
	 */
	public static String getValueByXPath(Element element,String xpath){
		return Xsoup.select(element,xpath).get();
	}

	/**
	 * Evaluates the XPath against the HTML string produced by {@code elements.html()}
	 * and returns a single result.
	 *
	 * @param elements elements whose HTML is handed to Xsoup
	 * @param xpath    XPath expression understood by Xsoup
	 * @return the result of {@code XElements.get()}
	 */
	public static String getValueByXPath(Elements elements,String xpath){
		return Xsoup.select(elements.html(),xpath).get();
	}

	/**
	 * Concatenates {@code wholeText()} of every element selected by the XPath and
	 * removes all line-feed characters from the result.
	 *
	 * @param element node the XPath is evaluated against
	 * @param xpath   XPath expression understood by Xsoup
	 * @return the joined text without {@code '\n'}; empty when nothing is selected
	 */
	public static String getAllTextByXpath(Element element,String xpath){
		XElements xElements = Xsoup.select(element, xpath);
		Elements elements = xElements.getElements();
		StringBuilder text= new StringBuilder();
		for(Element e:elements){
			text.append(e.wholeText());
		}
		// Only '\n' is stripped; other whitespace (including '\r') is kept as-is.
		return text.toString().replace("\n","");
	}

	/**
	 * Finds the first match of the regex in the content.
	 *
	 * @param content text to search; {@code null} yields {@code null}
	 * @param regx    regex source (compiled with {@link Pattern#DOTALL})
	 * @param isGroup {@code true} to return capture group 1, {@code false} to return the whole match
	 * @return the matched text, or {@code null} when there is no match
	 * @throws IndexOutOfBoundsException if {@code isGroup} is true but the regex has no capture group
	 */
	public static String getFirstMatcher(String content,String regx,boolean isGroup){
		return getFirstMatcher(content,regx,isGroup ? 1 : 0);
	}

	/**
	 * Finds the first match of the regex in the content and returns the requested group.
	 *
	 * @param content    text to search; {@code null} yields {@code null}
	 * @param regx       regex source (compiled with {@link Pattern#DOTALL})
	 * @param groupIndex capture group to return; 0 is the whole match
	 * @return the group's text, or {@code null} when there is no match
	 * @throws IndexOutOfBoundsException if the regex has no group with that index
	 */
	public static String getFirstMatcher(String content,String regx,int groupIndex){
		// Treat missing content like "no match" instead of failing inside Pattern.matcher.
		if(content == null){
			return null;
		}
		Matcher matcher = compile(regx).matcher(content);
		if(matcher.find()){
			return matcher.group(groupIndex);
		}
		return null;
	}

	/**
	 * Small manual demo that prints {@code text()} and {@code wholeText()} of a parsed snippet.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		String html = "<div id=\"example\">这是一个文本。<p>这是在p标签中的文本。</p>这是更多的文本。</div>";
		Element element = Jsoup.parse(html);
		System.out.println(element.text());
		System.out.println(element.wholeText());
	}
}
